last updated: 2023-08-18
As usual, make sure we have the right packages for this exercise
if (!require("pacman")) install.packages("pacman"); library(pacman)
## Loading required package: pacman
# let's load all of the files we were using and want to have again today
p_load("tidyverse", "knitr", "readr",
"pander", "BiocManager",
"dplyr", "stringr",
"purrr", # for working with lists (beautify column names)
"reactable") # for pretty tables.
# We also need these Bioconductor packages today.
p_load("edgeR", "AnnotationDbi", "org.Sc.sgd.db")
This will be our first differential expression analysis workflow, converting gene counts across samples into meaningful information about genes that appear to be significantly differentially expressed between samples
At the end of this exercise, you should be able to:
library(edgeR)
library(org.Sc.sgd.db)
# for ease of use, set max number of digits after decimal
options(digits=3)
We saved this file in the last exercise (Read_Counting.Rmd) from the
RSubread package. Now we can load that object back in and assign it to
the variable fc. Be sure to change the file path if you
have saved it in a different location.
path_fc_object <- path.expand("~/Desktop/Genomic_Data_Analysis/Data/yeast_fc_output.Rds")
fc <- readRDS(file = path_fc_object)
If you don’t have that file for any reason, the below code chunk will load a copy of it from Github.
if( !exists("fc") )
{
fc <- read_rds('https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/data/ethanol_stress/yeast_fc_output.Rds')
}
So far, we’ve been able to process all of the fastq files without much information about what each sample is in the experimental design. Now, we need the metadata for the samples. Note that the order matters for these files
To find the order of files we need, we can get just the part of the column name before the first “.” symbol with this command:
str_split_fixed(fc$counts %>% colnames(), "\\.", n = 2)[, 1]
## [1] "YPS606_MSN24_ETOH_REP1_R1" "YPS606_MSN24_ETOH_REP2_R1"
## [3] "YPS606_MSN24_ETOH_REP3_R1" "YPS606_MSN24_ETOH_REP4_R1"
## [5] "YPS606_MSN24_MOCK_REP1_R1" "YPS606_MSN24_MOCK_REP2_R1"
## [7] "YPS606_MSN24_MOCK_REP3_R1" "YPS606_MSN24_MOCK_REP4_R1"
## [9] "YPS606_WT_ETOH_REP1_R1" "YPS606_WT_ETOH_REP2_R1"
## [11] "YPS606_WT_ETOH_REP3_R1" "YPS606_WT_ETOH_REP4_R1"
## [13] "YPS606_WT_MOCK_REP1_R1" "YPS606_WT_MOCK_REP2_R1"
## [15] "YPS606_WT_MOCK_REP3_R1" "YPS606_WT_MOCK_REP4_R1"
sample_metadata <- tribble(
~Sample, ~Genotype, ~Condition,
"YPS606_MSN24_ETOH_REP1_R1", "msn24dd", "EtOH",
"YPS606_MSN24_ETOH_REP2_R1", "msn24dd", "EtOH",
"YPS606_MSN24_ETOH_REP3_R1", "msn24dd", "EtOH",
"YPS606_MSN24_ETOH_REP4_R1", "msn24dd", "EtOH",
"YPS606_MSN24_MOCK_REP1_R1", "msn24dd", "unstressed",
"YPS606_MSN24_MOCK_REP2_R1", "msn24dd", "unstressed",
"YPS606_MSN24_MOCK_REP3_R1", "msn24dd", "unstressed",
"YPS606_MSN24_MOCK_REP4_R1", "msn24dd", "unstressed",
"YPS606_WT_ETOH_REP1_R1", "WT", "EtOH",
"YPS606_WT_ETOH_REP2_R1", "WT", "EtOH",
"YPS606_WT_ETOH_REP3_R1", "WT", "EtOH",
"YPS606_WT_ETOH_REP4_R1", "WT", "EtOH",
"YPS606_WT_MOCK_REP1_R1", "WT", "unstressed",
"YPS606_WT_MOCK_REP2_R1", "WT", "unstressed",
"YPS606_WT_MOCK_REP3_R1", "WT", "unstressed",
"YPS606_WT_MOCK_REP4_R1", "WT", "unstressed") %>%
# Create a new column that combines the Genotype and Condition value
mutate(Group = factor(
paste(Genotype, Condition, sep = "."),
levels = c(
"WT.unstressed","WT.EtOH",
"msn24dd.unstressed", "msn24dd.EtOH"
)
)) %>%
# make Condition and Genotype a factor (with baseline as first level) for edgeR
mutate(
Genotype = factor(Genotype,
levels = c("WT", "msn24dd")),
Condition = factor(Condition,
levels = c("unstressed", "EtOH"))
)
Now, let’s create a design matrix with this information
group <- sample_metadata$Group
design <- model.matrix(~ 0 + group)
design
## groupWT.unstressed groupWT.EtOH groupmsn24dd.unstressed groupmsn24dd.EtOH
## 1 0 0 0 1
## 2 0 0 0 1
## 3 0 0 0 1
## 4 0 0 0 1
## 5 0 0 1 0
## 6 0 0 1 0
## 7 0 0 1 0
## 8 0 0 1 0
## 9 0 1 0 0
## 10 0 1 0 0
## 11 0 1 0 0
## 12 0 1 0 0
## 13 1 0 0 0
## 14 1 0 0 0
## 15 1 0 0 0
## 16 1 0 0 0
## attr(,"assign")
## [1] 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
colnames(design) <- levels(group)
design
## WT.unstressed WT.EtOH msn24dd.unstressed msn24dd.EtOH
## 1 0 0 0 1
## 2 0 0 0 1
## 3 0 0 0 1
## 4 0 0 0 1
## 5 0 0 1 0
## 6 0 0 1 0
## 7 0 0 1 0
## 8 0 0 1 0
## 9 0 1 0 0
## 10 0 1 0 0
## 11 0 1 0 0
## 12 0 1 0 0
## 13 1 0 0 0
## 14 1 0 0 0
## 15 1 0 0 0
## 16 1 0 0 0
## attr(,"assign")
## [1] 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
The count matrix is used to construct a DGEList class object. This is the main data class in the edgeR package. The DGEList object is used to store all the information required to fit a generalized linear model to the data, including library sizes and dispersion estimates as well as counts for each gene.
y <- DGEList(fc$counts, group=group)
colnames(y) <- sample_metadata$Sample
y$samples
## group lib.size norm.factors
## YPS606_MSN24_ETOH_REP1_R1 msn24dd.EtOH 195105 1
## YPS606_MSN24_ETOH_REP2_R1 msn24dd.EtOH 180513 1
## YPS606_MSN24_ETOH_REP3_R1 msn24dd.EtOH 165696 1
## YPS606_MSN24_ETOH_REP4_R1 msn24dd.EtOH 172157 1
## YPS606_MSN24_MOCK_REP1_R1 msn24dd.unstressed 137673 1
## YPS606_MSN24_MOCK_REP2_R1 msn24dd.unstressed 141652 1
## YPS606_MSN24_MOCK_REP3_R1 msn24dd.unstressed 171295 1
## YPS606_MSN24_MOCK_REP4_R1 msn24dd.unstressed 172884 1
## YPS606_WT_ETOH_REP1_R1 WT.EtOH 154719 1
## YPS606_WT_ETOH_REP2_R1 WT.EtOH 169239 1
## YPS606_WT_ETOH_REP3_R1 WT.EtOH 179811 1
## YPS606_WT_ETOH_REP4_R1 WT.EtOH 157740 1
## YPS606_WT_MOCK_REP1_R1 WT.unstressed 186150 1
## YPS606_WT_MOCK_REP2_R1 WT.unstressed 154835 1
## YPS606_WT_MOCK_REP3_R1 WT.unstressed 184436 1
## YPS606_WT_MOCK_REP4_R1 WT.unstressed 172568 1
Human-readable gene symbols can also be added to complement the gene ID for each gene, using the annotation in the org.Sc.sgd.db package.
y$genes <- AnnotationDbi::select(org.Sc.sgd.db,keys=rownames(y),columns="GENENAME")
## 'select()' returned 1:1 mapping between keys and columns
head(y$genes)
## ORF SGD GENENAME
## 1 YDL246C S000002405 SOR2
## 2 YDL243C S000002402 AAD4
## 3 YDR387C S000002795 CIN10
## 4 YDL094C S000002252 <NA>
## 5 YDR438W S000002846 THI74
## 6 YDR523C S000002931 SPS1
Genes with very low counts across all libraries provide little evidence for differential ex- pression. In addition, the pronounced discreteness of these counts interferes with some of the statistical approximations that are used later in the pipeline. These genes should be filtered out prior to further analysis. Here, we will retain a gene only if it is expressed at a count-per-million (CPM) above 60 in at least four samples.
keep <- rowSums(cpm(y) > 60) >= 4
y <- y[keep,]
summary(keep)
## Mode FALSE TRUE
## logical 4518 2609
Where did those cutoff numbers come from?
As a general rule, we don’t want to exclude a gene that is expressed in only one group, so a cutoff number equal to the number of replicates can be a good starting point. For counts, a good threshold can be chosen by identifying the CPM that corresponds to a count of 10, which in this case would be about 60 (due to our fastq files being subsets of the full reads):
cpm(10, mean(y$samples$lib.size))
## [,1]
## [1,] 59.3
Smaller CPM thresholds are usually appropriate for larger libraries.
TMM normalization is performed to eliminate composition biases between libraries. This generates a set of normalization factors, where the product of these factors and the library sizes defines the effective library size. The calcNormFactors function returns the DGEList argument with only the norm.factors changed.
y <- calcNormFactors(y)
y$samples
## group lib.size norm.factors
## YPS606_MSN24_ETOH_REP1_R1 msn24dd.EtOH 195105 1.156
## YPS606_MSN24_ETOH_REP2_R1 msn24dd.EtOH 180513 1.094
## YPS606_MSN24_ETOH_REP3_R1 msn24dd.EtOH 165696 1.074
## YPS606_MSN24_ETOH_REP4_R1 msn24dd.EtOH 172157 0.997
## YPS606_MSN24_MOCK_REP1_R1 msn24dd.unstressed 137673 1.038
## YPS606_MSN24_MOCK_REP2_R1 msn24dd.unstressed 141652 1.046
## YPS606_MSN24_MOCK_REP3_R1 msn24dd.unstressed 171295 0.999
## YPS606_MSN24_MOCK_REP4_R1 msn24dd.unstressed 172884 0.996
## YPS606_WT_ETOH_REP1_R1 WT.EtOH 154719 0.865
## YPS606_WT_ETOH_REP2_R1 WT.EtOH 169239 0.908
## YPS606_WT_ETOH_REP3_R1 WT.EtOH 179811 0.945
## YPS606_WT_ETOH_REP4_R1 WT.EtOH 157740 0.928
## YPS606_WT_MOCK_REP1_R1 WT.unstressed 186150 1.004
## YPS606_WT_MOCK_REP2_R1 WT.unstressed 154835 1.065
## YPS606_WT_MOCK_REP3_R1 WT.unstressed 184436 0.942
## YPS606_WT_MOCK_REP4_R1 WT.unstressed 172568 0.984
The normalization factors multiply to unity across all libraries. A normalization factor below unity indicates that the library size will be scaled down, as there is more suppression (i.e., composition bias) in that library relative to the other libraries. This is also equivalent to scaling the counts upwards in that sample. Conversely, a factor above unity scales up the library size and is equivalent to downscaling the counts. The performance of the TMM normalization procedure can be examined using mean- difference (MD) plots. This visualizes the library size-adjusted log-fold change between two libraries (the difference) against the average log-expression across those libraries (the mean). The below command plots an MD plot, comparing sample 1 against an artificial library constructed from the average of all other samples.
for (sample in 1:nrow(y$samples)) {
plotMD(cpm(y, log=TRUE), column=sample)
abline(h=0, col="red", lty=2, lwd=2)
}
The data can be explored by generating multi-dimensional scaling (MDS) plots. This visualizes the differences between the expression profiles of different samples in two dimensions. The next plot shows the MDS plot for the yeast heatshock data.
points <- c(1,1,2,2)
colors <- rep(c("black", "red"),8)
plotMDS(y, col=colors[group], pch=points[group])
legend("topright", legend=levels(group),
pch=points, col=colors, ncol=2)
The trended NB dispersion is estimated using the estimateDisp function. This returns the DGEList object with additional entries for the estimated NB dispersions for all genes. These estimates can be visualized with plotBCV, which shows the root-estimate, i.e., the biological coefficient of variation for each gene
y <- estimateDisp(y, design, robust=TRUE)
plotBCV(y)
In general, the trend in the NB dispersions should decrease smoothly with increasing abundance. This is because the expression of high-abundance genes is expected to be more stable than that of low-abundance genes. Any substantial increase at high abundances may be indicative of batch effects or trended biases. The value of the trended NB dispersions should range between 0.005 to 0.05 for laboratory-controlled biological systems like mice or cell lines, though larger values will be observed for patient-derived data (> 0.1)
For the QL dispersions, estimation can be performed using the glmQLFit function. This returns a DGEGLM object containing the estimated values of the GLM coefficients for each gene
fit <- glmQLFit(y, design, robust=TRUE)
head(fit$coefficients)
## WT.unstressed WT.EtOH msn24dd.unstressed msn24dd.EtOH
## YDR492W -9.12 -11.62 -9.12 -12.04
## YDR508C -7.49 -7.87 -7.57 -7.57
## YDR186C -9.88 -9.31 -9.85 -9.41
## YDR150W -8.42 -9.15 -8.36 -9.23
## YDL182W -7.60 -9.23 -7.79 -8.71
## YDR232W -8.25 -8.55 -8.50 -8.92
plotQLDisp(fit)
EB squeezing of the raw dispersion estimators towards the trend reduces the uncertainty of the final estimators. The extent of this moderation is determined by the value of the prior df, as estimated from the data. Large estimates for the prior df indicate that the QL dispersions are less variable between genes, meaning that stronger EB moderation can be performed. Small values for the prior df indicate that the dispersions are highly variable, meaning that strong moderation would be inappropriate
Setting robust=TRUE in glmQLFit is strongly recommended. This causes glmQLFit to estimate a vector of df.prior values, with lower values for outlier genes and larger values for the main body of genes.
The final step is to actually test for significant differential expression in each gene, using the QL F-test. The contrast of interest can be specified using the makeContrasts function. Here, genes are detected that are DE between the stressed and unstressed. This is done by defining the null hypothesis as heat stressed - unstressed = 0.
# generate contrasts we are interested in learning about
my.contrasts <- makeContrasts(EtOHvsMOCK.WT = WT.EtOH - WT.unstressed,
EtOHvsMOCK.MSN24dd = msn24dd.EtOH - msn24dd.unstressed,
EtOH.MSN24ddvsWT = msn24dd.EtOH - WT.EtOH,
MOCK.MSN24ddvsWT = msn24dd.unstressed - WT.unstressed,
EtOHvsWT.MSN24ddvsWT = (msn24dd.EtOH-msn24dd.unstressed)-(WT.EtOH-WT.unstressed),
levels=design)
# This contrast looks at the difference in the stress responses between mutant and WT
res <- glmQLFTest(fit, contrast = my.contrasts[,"EtOHvsWT.MSN24ddvsWT"])
# let's take a quick look at the results
topTags(res, n=10)
## Coefficient: 1*WT.unstressed -1*WT.EtOH -1*msn24dd.unstressed 1*msn24dd.EtOH
## ORF SGD GENENAME logFC logCPM F PValue FDR
## YKL035W YKL035W S000001518 UGP1 -4.03 10.87 356.2 2.37e-37 6.19e-34
## YPR149W YPR149W S000006353 NCE102 -3.92 8.09 172.8 8.17e-25 9.64e-22
## YDR343C YDR343C S000002751 HXT6 -5.49 8.15 207.2 1.11e-24 9.64e-22
## YPL004C YPL004C S000005925 LSP1 -2.71 8.96 118.6 1.64e-19 1.07e-16
## YDR077W YDR077W S000002484 SED1 -1.48 10.93 93.6 1.26e-16 6.56e-14
## YKL150W YKL150W S000001633 MCR1 -2.89 8.69 83.5 2.29e-15 9.95e-13
## YAL005C YAL005C S000000004 SSA1 -1.37 11.89 77.7 1.30e-14 4.86e-12
## YOL155C YOL155C S000005515 HPF1 -1.49 10.60 76.4 1.96e-14 6.38e-12
## YGR086C YGR086C S000003318 PIL1 -1.67 9.02 74.4 3.53e-14 1.02e-11
## YML100W YML100W S000004566 TSL1 -7.85 9.86 70.2 1.34e-13 3.51e-11
# generate a beautiful table for the pdf/html file.
topTags(res, n=Inf) %>% data.frame() %>%
arrange(FDR) %>%
mutate(logFC=round(logFC,2)) %>%
mutate(across(where(is.numeric), signif, 3)) %>%
reactable()
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(where(is.numeric), signif, 3)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
is.de <- decideTestsDGE(res, p.value=0.05)
summary(is.de)
## 1*WT.unstressed -1*WT.EtOH -1*msn24dd.unstressed 1*msn24dd.EtOH
## Down 134
## NotSig 2359
## Up 116
Let’s take a quick look at the differential expression
plotSmear(res, de.tags=rownames(res)[is.de!=0])
We need to make sure and save our output file(s).
# Choose topTags destination
dir_output_edgeR <-
path.expand("~/Desktop/Genomic_Data_Analysis/Analysis/edgeR/")
if (!dir.exists(dir_output_edgeR)) {
dir.create(dir_output_edgeR, recursive = TRUE)
}
# for shairng with others, the topTags output is convenient.
topTags(res, n = Inf) %>% data.frame() %>%
arrange(desc(logFC)) %>%
mutate(logFC = round(logFC, 2)) %>%
mutate(across(where(is.numeric), signif, 3)) %>%
write_tsv(., file = paste0(dir_output_edgeR, "yeast_topTags_edgeR.tsv"))
# for subsequent analysis, let's save the res object as an R data object.
saveRDS(object = res, file = paste0(dir_output_edgeR, "yeast_res_edgeR.Rds"))
Be sure to knit this file into a pdf or html file once you’re finished.
System information for reproducibility:
pander::pander(sessionInfo())
R version 4.2.2 (2022-10-31)
Platform: aarch64-apple-darwin20 (64-bit)
locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8
attached base packages: stats4, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: org.Sc.sgd.db(v.3.16.0), AnnotationDbi(v.1.60.2), IRanges(v.2.32.0), S4Vectors(v.0.36.2), Biobase(v.2.58.0), BiocGenerics(v.0.44.0), edgeR(v.3.40.2), limma(v.3.54.2), reactable(v.0.4.4), BiocManager(v.1.30.21.1), pander(v.0.6.5), knitr(v.1.43), lubridate(v.1.9.2), forcats(v.1.0.0), stringr(v.1.5.0), dplyr(v.1.1.2), purrr(v.1.0.1), readr(v.2.1.4), tidyr(v.1.3.0), tibble(v.3.2.1), ggplot2(v.3.4.2), tidyverse(v.2.0.0) and pacman(v.0.5.1)
loaded via a namespace (and not attached): httr(v.1.4.6), sass(v.0.4.7), vroom(v.1.6.3), splines(v.4.2.2), bit64(v.4.0.5), jsonlite(v.1.8.7), bslib(v.0.5.0), statmod(v.1.5.0), highr(v.0.10), blob(v.1.2.4), GenomeInfoDbData(v.1.2.9), yaml(v.2.3.7), pillar(v.1.9.0), RSQLite(v.2.3.1), lattice(v.0.21-8), glue(v.1.6.2), digest(v.0.6.33), XVector(v.0.38.0), colorspace(v.2.1-0), htmltools(v.0.5.5), reactR(v.0.4.4), pkgconfig(v.2.0.3), zlibbioc(v.1.44.0), scales(v.1.2.1), tzdb(v.0.4.0), timechange(v.0.2.0), KEGGREST(v.1.38.0), generics(v.0.1.3), ellipsis(v.0.3.2), cachem(v.1.0.8), withr(v.2.5.0), cli(v.3.6.1), magrittr(v.2.0.3), crayon(v.1.5.2), memoise(v.2.0.1), evaluate(v.0.21), fansi(v.1.0.4), tools(v.4.2.2), hms(v.1.1.3), lifecycle(v.1.0.3), munsell(v.0.5.0), locfit(v.1.5-9.8), Biostrings(v.2.66.0), compiler(v.4.2.2), jquerylib(v.0.1.4), GenomeInfoDb(v.1.34.9), rlang(v.1.1.1), grid(v.4.2.2), RCurl(v.1.98-1.12), rstudioapi(v.0.15.0), htmlwidgets(v.1.6.2), crosstalk(v.1.2.0), bitops(v.1.0-7), rmarkdown(v.2.23), gtable(v.0.3.3), DBI(v.1.1.3), R6(v.2.5.1), fastmap(v.1.1.1), bit(v.4.0.5), utf8(v.1.2.3), stringi(v.1.7.12), parallel(v.4.2.2), Rcpp(v.1.0.11), vctrs(v.0.6.3), png(v.0.1-8), tidyselect(v.1.2.0) and xfun(v.0.39)